qui {
noisily {
	/*************************************************/
	/************* Step 1.7. Openness ****************/
	/*************************************************/

/*

To gain panel data on the evolution of trade openness, our strategy is the following:
	- Collect data on import & export values from various sources
	- Clean the data
	- Define trade openness as (imports + exports)/GDP
	- Linearly interpolate missing values in all data sources
	- Select a base series, the most complete dataset
	- Use the additional sources, to collect a panel data set covering the entire period under consideration:
	
		* IF THE MISSING VALUES ARE RELATED TO A COUNTRY ALREADY APPEARING IN BASELINE DATA:
		-> Use the evolution of openness in the selected dataset to fill the gaps in the baseline data, i.e. if open_i,t is missing:
			forward extension:  open_i,t = (open_selected_i,t / open_selected_i,t-1) * open_baseline_i,t-1
			backward extension: open_i,t = (open_selected_i,t / open_selected_i,t+1) * open_baseline_i,t+1
			
		* IF THE MISSING VALUES ARE RELATED TO A COUNTRY NOT COVERED IN BASELINE DATA:
		-> Approximate by using the level of trade openness in the alternative dataset:
			open_i,t = open_selected_i,t if open_i,t is missing
			
	- Proceed until panel covers entire time period under consideration
	- To further complete the panel: regress the composite series on a third-order polynomial of the trade openness reported in each alternative data source, together with year and region dummies and their interactions, and use the fitted values to approximate missing data
	- Use the implied growth rates in these estimated trajectories to maximally extend the baseline data forward and backwards.
	- Result = composite index of trade openness, expressed in (imports + exports)/GDP

*/

}
 
/* 1.7.1. Load data */
* Move from the dofile directory to the sibling folder holding the intermediary
* datasets. Forward slashes are used because Stata resolves them on Windows,
* macOS and Linux alike, whereas a ".\" prefix only works on Windows.
cd ..
cd "./3. Intermediary results"

* Attach each trade source to the data in memory.
* keep(master match) discards observations that exist only in the using file
* (formerly: drop if _merge == 2) and nogenerate suppresses the _merge marker
* (formerly: drop _merge), so the panel in memory never gains rows.
merge 1:1 year cntrycode_COW_T using COW_T, keep(master match) nogenerate
merge 1:1 year cntrycode_PWT56 using PWT56, keepusing(open_PWT56) keep(master match) nogenerate
merge 1:1 year cntrycode_PWT81 using PWT81, keepusing(open_PWT81) keep(master match) nogenerate
merge 1:1 year cntrycode_WDI using WDI, keepusing(imports_WDI exports_WDI) keep(master match) nogenerate
merge 1:1 year cntrycode_WDI using WDI_trade, keepusing(import_WDI export_WDI) keep(master match) nogenerate  // WDI data on total value of imports and exports (expressed in millions of current USD)

/* 1.7.2. Clean data */

		* Screen the trade data for unreliable observations.
		// WDI and COW both report total import and export values in the same units, so the two sources should agree
			noisily display ""
			noisily display "CLEANING PROCEDURE COW TRADE DATA"

			noisily display "-> FIRST: remove some suspicious data points in COW trade data:"

				* Blank out COW import and export values in the early 1960s for four
				* countries whose series show a discontinuous downward jump. The block
				* runs noisily so the number of changed observations is reported.
				noisily {
				* Czechoslovakia (COW 315), 1960-1963: e.g. from 7.74e+09 in 1959 to 1.05e+08 in 1960
				replace import_COW = . if cntrycode_COW == 315 & inrange(year, 1960, 1963)
				replace export_COW = . if cntrycode_COW == 315 & inrange(year, 1960, 1963)
				* Poland (COW 290), 1960-1963
				replace import_COW = . if cntrycode_COW == 290 & inrange(year, 1960, 1963)
				replace export_COW = . if cntrycode_COW == 290 & inrange(year, 1960, 1963)
				* GDR (COW 265), 1960-1961
				replace import_COW = . if cntrycode_COW == 265 & inrange(year, 1960, 1961)
				replace export_COW = . if cntrycode_COW == 265 & inrange(year, 1960, 1961)
				* Russia (COW 365), 1960-1962
				replace import_COW = . if cntrycode_COW == 365 & inrange(year, 1960, 1962)
				replace export_COW = . if cntrycode_COW == 365 & inrange(year, 1960, 1962)
				}
	
			noisily display "SECOND: compare COW with trade flows reported in WDI to rectify some apparent computational errors."
			noisily display ""
			noisily display "Correlation between both data sources:"
			noisily correlate import_WDI import_COW
			noisily correlate export_WDI export_COW

			* Relative gap between the two sources, measured against the WDI value;
			* large gaps point to suspicious COW observations.
			generate import_discrepancy = abs(import_WDI-import_COW)/import_WDI		// share by which COW total imports deviate from WDI
			generate export_discrepancy = abs(export_WDI-export_COW)/export_WDI		// share by which COW total exports deviate from WDI

			noisily display "Average percentage discrepancy between WDI and COW-reported values:"
			noisily summarize import_discrepancy export_discrepancy

			* Walk through the suspicious cases one by one. Each case prints the
			* verdict, lists both sources side by side and, where a correction was
			* decided, rescales the COW values.
			noisily display ""
			noisily display "Looking at # of suspicious cases, corrections made:"

				* One sort is enough for all listings below: the corrections only
				* touch import_COW and export_COW, never the sort keys.
				sort cntry year

				* Case 1: Bahamas (no correction)
				noisily display ""
				noisily display "Case 1: Bahamas"
				noisily display "Sharp increase in COW trade flows around independence declaration in 1973."
				noisily display "No apparent inconsistencies with WDI data (though lower WDI-values in 1981-1996 period): left as is."
				noisily list cntry year import_COW import_WDI export_COW export_WDI if cntry == "Bahamas" & year >= 1960

				* Case 2: Ghana (no correction)
				noisily display ""
				noisily display "Case 2: Ghana"
				noisily display "No apparent inconsistencies with WDI data (though lower COW-values in 1991 period): left as is."
				noisily list cntry year import_COW import_WDI export_COW export_WDI if cntry == "Ghana" & year >= 1950

				* Case 3: Liberia (COW imports divided by 10 for 1990-2014)
				noisily display ""
				noisily display "Case 3: Liberia"
				noisily display "Between 1989-1990: tenfold increase in COW-imports & fivefold increase in COW-exports although war broke out."
				noisily display "Between 1997 & 2014: COW-import values consistently larger by a factor of 10."
				noisily display "Corrected: import_COW = import_COW/10 between 1990 & 2014"
				* The listing shows the values before the correction is applied
				noisily list cntry year import_COW import_WDI export_COW export_WDI if cntry == "Liberia" & year >= 1950
				replace import_COW = import_COW/10 if cntry == "Liberia" & inrange(year, 1990, 2014)

				* Case 4: St. Lucia (no correction)
				noisily display ""
				noisily display "Case 4: St. Lucia"
				noisily display "No apparent inconsistencies with WDI data (though lower COW-values in 1991 period): left as is."
				noisily list cntry year import_COW import_WDI export_COW export_WDI if cntry == "St. Lucia" & year >= 1950

				* Case 5: Luxembourg (COW imports and exports divided by 10 for 1960-1996)
				noisily display ""
				noisily display "Case 5: Luxembourg"
				noisily display "COW import and exports values consistently larger by a factor of 10 between 1960 & 1996."
				noisily display "Corrected: import_COW = import_COW/10 between 1960 & 1996"
				noisily display "Corrected: export_COW = export_COW/10 between 1960 & 1996"
				* The listing shows the values before the correction is applied
				noisily list cntry year import_COW import_WDI export_COW export_WDI if cntry == "Luxembourg" & year >= 1950
				replace import_COW = import_COW/10 if cntry == "Luxembourg" & inrange(year, 1960, 1996)
				replace export_COW = export_COW/10 if cntry == "Luxembourg" & inrange(year, 1960, 1996)

				* Case 6: Myanmar (no correction)
				noisily display ""
				noisily display "Case 6: Myanmar"
				noisily display "No apparent inconsistencies with WDI data (though higher WDI-values in 2000-2012 period): left as is."
				noisily list cntry year import_COW import_WDI export_COW export_WDI if cntry == "Myanmar" & year >= 1950

/* 1.7.4. Express all data sources in identical units (% of GDP) */

* WDI: sum of the import and export series divided by 100; left missing unless
* both components are available.
generate open_WDI = (imports_WDI+exports_WDI)/100 if !missing(imports_WDI, exports_WDI)

* PWT56: rescaled by the same factor of 100.
* NOTE(review): open_PWT81 is not rescaled here - presumably it is already on
* this scale; confirm against the step that builds PWT81.
replace open_PWT56 = open_PWT56/100

* COW: total trade value relative to baseline GDP; left missing unless both
* flows are available.
generate open_COW = (export_COW+import_COW)/baseline_GDP if !missing(export_COW, import_COW)

/* 1.7.5. Linearly interpolate missing values */
* For every source series open_X this loop
*   - keeps an untouched copy in open_X_raw,
*   - fills gaps within each country by linear interpolation over year
*     (ipolate without the epolate option, so nothing is extrapolated),
*   - records in interpolated_open_X whether a value was observed (0) or
*     obtained by interpolation (1); the flag stays missing where open_X
*     is still missing.
* The data must be sorted by country for the by-prefix below.
sort cntry year

foreach name in "PWT81" "WDI" "PWT56" "COW" {
	* Flag the originally observed values before anything is filled in
	gen interpolated_open_`name' = 0 if open_`name' != .
	gen open_`name'_raw = open_`name'

	* Interpolate into a scratch variable, then overwrite the series with it
	by cntry: ipolate open_`name' year, generate(open_`name'1)
	replace open_`name' = open_`name'1
	drop open_`name'1

	* Whatever is non-missing now but was not flagged as observed was interpolated
	replace interpolated_open_`name' = 1 if interpolated_open_`name' == . & open_`name' != .
	}

/* 1.7.6. Use PWT81 data as baseline */
* The composite series open starts as a copy of the (interpolated) PWT81
* series. source_open = 1 marks values taken from this baseline.
gen open = open_PWT81
gen source_open = 1 if open != .
scalar correlation_open_1 = 1

	* Identify interpolated parts
	* The flag must stay MISSING where open is missing: every later extension
	* step only fills the flag where it is still missing. (The previous
	* "gen interpolated_open = 0 & open != ." evaluated a logical expression
	* and set the flag to 0 for all observations, which silently disabled
	* those later updates.)
	gen interpolated_open = 0 if open != .
	replace interpolated_open = 1 if interpolated_open_PWT81 == 1 & open != .

	* Identify extrapolated parts
	gen extrapolated_open = 0  if open != .

	* identify polynomial predicted parts
	gen polynomialpredicted_open = 0  if open != .
		
	* Show correlation with baseline data
	* Correlations are computed on the in-sample observations only. The WDI and
	* COW correlations are stored as correlation_open_2 and correlation_open_4;
	* the PWT56 correlation is displayed but not stored here.
	noi di "Correlation with baseline series (PWT81)"
	noisily corr open_WDI open if year >= $startyear & independence_years != .
	scalar correlation_open_2 = round(`r(rho)',.01)
	noisily corr open_PWT56 open if year >= $startyear & independence_years != .
	noisily corr open_COW open if year >= $startyear & independence_years != .
	scalar correlation_open_4 = round(`r(rho)',.01)

/* 1.7.7. Maximally extend forward and backward by growth rates implied in WDI data */
* Declare the panel structure so that the lag (L.) and lead (F.) operators work.
xtset cntrycode year

foreach dataset in "WDI" {

	* Forward chaining: a missing open becomes the previous year's open scaled
	* by the year-on-year change of the source series. The pass is repeated
	* until the number of non-missing in-sample observations stops increasing.
	local gained = 999
	while `gained' != 0 {
		count if open < . & year >= $startyear & independence_years != .
		local n_before = r(N)
		replace open = (1+(open_`dataset'-L.open_`dataset')/L.open_`dataset')*L.open if open == .
		count if open < . & year >= $startyear & independence_years != .
		local n_after = r(N)
		local gained = `n_after' - `n_before'
		}

	* Backward chaining: same logic, anchored on the following year.
	local gained = 999
	while `gained' != 0 {
		count if open < . & year >= $startyear & independence_years != .
		local n_before = r(N)
		replace open = (1+(open_`dataset'-F.open_`dataset')/F.open_`dataset')*F.open if open == .
		count if open < . & year >= $startyear & independence_years != .
		local n_after = r(N)
		local gained = `n_after' - `n_before'
		}

	* Bookkeeping for the values just obtained through growth rates:
	* source code 992, interpolation flag copied from the source where it is 1,
	* and marked as extrapolated.
	replace source_open = 992 if open != . & source_open == .
	replace interpolated_open = 1  if interpolated_open == . & interpolated_open_`dataset' == 1 & open != .
	replace extrapolated_open = 1  if open != . & extrapolated_open == .

	* Countries without any baseline value: take the source series in levels,
	* restricted to in-sample country-years.
	replace open = open_`dataset' if open == . & independence_years != . & year >= $startyear

	* Bookkeeping for the values just taken in levels: source code 2, not
	* extrapolated, not polynomial-predicted.
	replace source_open = 2 if open != . & source_open == .
	replace interpolated_open = 1  if interpolated_open == . & interpolated_open_`dataset' == 1 & open != .
	replace extrapolated_open = 0 if open != . & extrapolated_open == .
	replace polynomialpredicted_open = 0 if open != . & polynomialpredicted_open == .
	}
								
/* 1.7.8.   Predict missing country-years using other sources */
	noi di "Calculate predictions based on:", _continue
	
	foreach dataset in "open_PWT56" {
		noi di "`dataset'", _continue

		* Generate regressors.
		* NOTE: the regressions below pick their regressors with the wildcards
		* year_* and region_* . These also match every interaction term created
		* in this section, so all interactions enter the model and not only the
		* plain year and region dummies.

			* Squared and cubic term of the openness proxy:

				gen `dataset'_sq = `dataset'^2
				gen `dataset'_cu = `dataset'^3

			* Year dummies, renamed from year_1, year_2, ... to year_<calendar year>.
			* Assumes every year from $startyear to 2016 occurs in the data;
			* otherwise tab creates fewer dummies and the rename loop fails.

				tab year if year >= $startyear, gen(year_)

				local tyear = 2016 - $startyear + 1
				forval i = 1/`tyear' {
					local j = `i' + $startyear - 1
					rename year_`i' year_`j'
					}

			* Region dummies (the loops below assume exactly 7 region categories)

				tab regioncode, gen(region_)

				* Interactions:

					* region and year

					forval i = ${startyear}/2016 {
						forval j = 1/7 {
						gen year_`i'Xregion_`j' = year_`i' * region_`j'
						}
					}

					* year and openness proxy (level and square)

					forval i = ${startyear}/2016 {
						gen year_`i'X`dataset' = year_`i' * `dataset'
						gen year_`i'X`dataset'_sq = year_`i' * `dataset' * `dataset'
						}

					* region and openness proxy (level and square)

					forval j = 1/7 {
						gen region_`j'X`dataset' = region_`j' * `dataset'
						gen region_`j'X`dataset'_sq = region_`j' * `dataset' * `dataset'
						}

					* year, region and openness proxy

						forval i = ${startyear}/2016 {
							forval j = 1/7 {
								gen year_`i'Xregion_`j'X`dataset' = year_`i' * region_`j' * `dataset'
								}
							}

					* Predictions:

						* Regress the composite series on the proxy polynomial and the
						* dummies/interactions. If the regression returns a non-zero
						* return code, retry on a sample starting one year later.
						cap reg open `dataset' `dataset'_sq `dataset'_cu year_* region_* if independence_years != . & year >= $startyear
						local startyear = $startyear
						if _rc != 0 {
							local startyear = `startyear' + 1
							reg open `dataset' `dataset'_sq `dataset'_cu year_* region_* if independence_years != . & year >= `startyear'
							}
						scalar appr_`dataset'_r2 = round(e(r2_a), .001)

						* Fitted values and standard error of the linear prediction
						predict appr_`dataset' if independence_years != . & year >= $startyear
						predict appr_`dataset'_se if independence_years != . & year >= $startyear, stdp

						* 99% band around the fitted value (0.5% in each tail of the
						* t distribution with the residual degrees of freedom)
						local N = e(df_r)
						gen appr_`dataset'_ub = appr_`dataset' + invttail(`N',.005)*appr_`dataset'_se
						gen appr_`dataset'_lb = appr_`dataset' - invttail(`N',.005)*appr_`dataset'_se

						* Predictive accuracy: 1 if the observed value lies inside the
						* band, 0 if it lies outside, missing if either is unavailable.
						* (The statement assigning 0 used to appear twice; the
						* duplicate had no effect and was removed.)
						gen predictive_accuracy_`dataset' = .
						replace predictive_accuracy_`dataset' = 1 if open !=. & appr_`dataset' != . & appr_`dataset'_lb <= open & open <= appr_`dataset'_ub
						replace predictive_accuracy_`dataset' = 0 if open !=. & appr_`dataset' != . & appr_`dataset'_lb > open | open !=. & appr_`dataset' != . & open > appr_`dataset'_ub

						* Share of observations inside the band, stored as a scalar.
						* NOTE(review): "drop" is read as a variable here; it is not
						* created in this section - confirm it exists upstream.
						sum predictive_accuracy_`dataset' if drop == 0
						scalar predictive_accuracy_`dataset' = round(r(mean),.0001)

						* Drop unnecessaries (the fitted values appr_`dataset' are kept
						* for the reporting and extension steps that follow):

						drop year_* region_* `dataset'_sq `dataset'_cu appr_`dataset'_* predictive_*
						}

		noi di ""

/* 1.7.9.  Report implications */

* Correlation between the composite series and each set of fitted values,
* stored as correlation_open_3, correlation_open_4, ... in list order.
noisily display "Correlation of predictions with baseline data:"
local count = 3
foreach dataset in "open_PWT56" {
	noisily correlate open appr_`dataset' if year >= $startyear & independence_years != .
	scalar correlation_open_`count' = round(`r(rho)',.01)
	local ++count
	}

* Adjusted R-squared stored by the prediction step
noisily display "Adjusted R squared of regressions"
foreach dataset in "open_PWT56" {
	noisily scalar list appr_`dataset'_r2 
	}

* Predictive-accuracy scalar stored by the prediction step
noisily display "Predictive accuracy:"
foreach dataset in "open_PWT56" {
	noisily scalar list predictive_accuracy_`dataset'
	}

* Summary statistics of the composite series next to all fitted trajectories
noisily display "Approximated trajectories: summary statistics"
noisily summarize open appr_* if year >= $startyear & independence_years != .

/* 1.7.10. Extend baseline data */

noisily display "Extend baseline data using:", _continue

	* Polynomial prediction (PWT56)
	* The source counter starts at 2 and is incremented per dataset, so the
	* single dataset listed gets code 3 for values taken in levels and
	* 993 (code + 990) for values chained through growth rates.
	local source = 2
	foreach dataset in "open_PWT56" {

		local ++source
		local source2 = `source'+990
		noisily display "`dataset'", _continue

		* Forward chaining on the fitted trajectory: a missing open becomes the
		* previous year's open scaled by the year-on-year change of the fitted
		* values. Repeated until the number of non-missing in-sample
		* observations stops increasing.
		local gained = 999
		while `gained' != 0 {
			count if open < . & year >= $startyear & independence_years != .
			local n_before = r(N)
			replace open = (1+(appr_`dataset'-L.appr_`dataset')/L.appr_`dataset')*L.open if open == .
			count if open < . & year >= $startyear & independence_years != .
			local n_after = r(N)
			local gained = `n_after' - `n_before'
			}

		* Backward chaining: same logic, anchored on the following year.
		local gained = 999
		while `gained' != 0 {
			count if open < . & year >= $startyear & independence_years != .
			local n_before = r(N)
			replace open = (1+(appr_`dataset'-F.appr_`dataset')/F.appr_`dataset')*F.open if open == .
			count if open < . & year >= $startyear & independence_years != .
			local n_after = r(N)
			local gained = `n_after' - `n_before'
			}

		* Source code for the values just obtained through growth rates
		replace source_open = `source2' if open != . & source_open == .

		* Countries without any value so far: take the fitted values in levels,
		* restricted to in-sample country-years, and give them the plain code.
		replace open = appr_`dataset' if open == . & independence_years != . & year >= $startyear
		replace source_open = `source' if open != . & source_open == .

		* Copy the interpolation flag of the underlying source series
		replace interpolated_open = 0  if interpolated_open == . & open != . & interpolated_`dataset' == 0
		replace interpolated_open = 1  if interpolated_open == . & open != . & interpolated_`dataset' == 1

		* All values added in this step are flagged as polynomial-predicted and
		* as not extrapolated.
		* NOTE(review): unlike the WDI and COW steps, the growth-rate-chained
		* values are not flagged extrapolated_open = 1 here - confirm this is
		* intended.
		replace extrapolated_open = 0 if open != . & extrapolated_open == .
		replace polynomialpredicted_open = 1 if open != . & polynomialpredicted_open == .
		}
	
	* Extrapolation (COW)
	* Last extension step: chain the remaining gaps on the COW-based openness
	* series, then fall back on its level for countries that are still empty.
	foreach dataset in "COW" {

		* Forward chaining: a missing open becomes the previous year's open
		* scaled by the year-on-year change of the source series. Repeated until
		* the number of non-missing in-sample observations stops increasing.
		local gained = 999
		while `gained' != 0 {
			count if open < . & year >= $startyear & independence_years != .
			local n_before = r(N)
			replace open = (1+(open_`dataset'-L.open_`dataset')/L.open_`dataset')*L.open if open == .
			count if open < . & year >= $startyear & independence_years != .
			local n_after = r(N)
			local gained = `n_after' - `n_before'
			}

		* Backward chaining: same logic, anchored on the following year.
		local gained = 999
		while `gained' != 0 {
			count if open < . & year >= $startyear & independence_years != .
			local n_before = r(N)
			replace open = (1+(open_`dataset'-F.open_`dataset')/F.open_`dataset')*F.open if open == .
			count if open < . & year >= $startyear & independence_years != .
			local n_after = r(N)
			local gained = `n_after' - `n_before'
			}

		* Bookkeeping for the values just obtained through growth rates:
		* source code 994, interpolation flag copied from the source where it
		* is 1, and marked as extrapolated.
		replace source_open = 994 if open != . & source_open == .
		replace interpolated_open = 1  if interpolated_open == . & interpolated_open_`dataset' == 1 & open != .
		replace extrapolated_open = 1  if open != . & extrapolated_open == .

		* Countries without any value so far: take the source series in levels,
		* restricted to in-sample country-years.
		replace open = open_`dataset' if open == . & independence_years != . & year >= $startyear

		* Bookkeeping for the values just taken in levels: source code 4, not
		* extrapolated, not polynomial-predicted.
		replace source_open = 4 if open != . & source_open == .
		replace interpolated_open = 1  if interpolated_open == . & interpolated_open_`dataset' == 1 & open != .
		replace extrapolated_open = 0 if open != . & extrapolated_open == .
		replace polynomialpredicted_open = 0 if open != . & polynomialpredicted_open == .
		}
		
	* Drop unreliable data points if necessary
	* Negative openness values are listed and then set to missing. Missing
	* values sort above every number in Stata, so "open < 0" already excludes
	* them.
	* NOTE(review): source_open and the interpolated/extrapolated/polynomial
	* flags of these observations are left untouched.
	noisily display ""
	noisily display "Replaced the following negative data points with missings:"
	noisily list cntry year open if open < 0
	replace open = . if open < 0
	noisily display ""
				
/* 1.7.11. Report composition of data */

* Summary statistics of the final composite series, in-sample country-years only
noisily display "Baseline data: summary statistics"
noisily summarize open if year >= $startyear & independence_years != .


/* 1.7.12. Drop unnecesaries, add relevant data */
* Remove the fitted trajectories, the per-source openness series and every
* variable whose name starts with export or import (this includes the
* discrepancy measures), then add the natural log of openness.
drop appr_* open_* export* import*
generate lopen = ln(open)

/* 1.7.13. Label variables */
label variable open "openness (average)"
label variable lopen "Log openness"

* Reroute to directory containing dofiles
* Forward slashes are used because Stata resolves them on Windows, macOS and
* Linux alike, whereas a ".\" prefix only works on Windows.
cd ..
cd "./1. Dofiles"
}
